Introduction

subdiagnosis <- readr::read_tsv(
  file.path("..", "..", "..", "data", "current", params$scpca_project_id, "single_cell_metadata.tsv"),
  show_col_types = FALSE
  ) |>
  dplyr::filter(scpca_sample_id == params$sample_id) |>
  dplyr::pull(subdiagnosis)

This notebook explores using CopyKAT to estimate tumor and normal cells in SCPCS000208 from SCPCP000006. This sample has a(n) Anaplastic subdiagnosis.

CopyKAT was run using the copyKAT.R script with and without a normal reference.

These results are read into this notebook and used to:

  • Visualize diploid and aneuploid cells on the UMAP.
  • Evaluate common copy number gains and losses in Wilms tumor.
  • Calculate the confusion matrix comparing manual annotations of tumor cells to using CopyKAT to annotate tumor cells.
  • Compare the annotations from CopyKAT to cell type annotations using label transfer and the fetal (kidney) references.

Packages

library(Seurat)
library(SCpubr)             # for plotting 
library(tidyverse)
library(patchwork)

set.seed(params$seed)

Base directories

# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", params$scpca_project_id)

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")

Input files

In this notebook, we are working with the Wilms tumor sample defined in SCPCS000208 from the Wilms tumor dataset SCPCP000006. We work with the pre-processed and labeled Seurat object that is the output of 02b_label-transfer_fetal_kidney_reference_Stewart.Rmd saved in the results directory.

result_dir <- file.path(module_base, "results", params$sample_id)

# copykat results
copykat_output_obj_noref <- file.path(result_dir, "copyKAT", "noref", "05_final-copykat.rds")
copykat_output_obj_ref <- file.path(result_dir, "copyKAT", "noref", "05_final-copykat.rds")

copykat_objs <- c(
  no_ref = copykat_output_obj_noref,
  with_ref = copykat_output_obj_ref
) |>
  purrr::map(readr::read_rds)

# predictions files 
predictions_file <- glue::glue("{params$sample_id}_copykat_prediction.txt")

predictions_paths <- c(
  no_ref = file.path(result_dir, "copyKAT", "noref", predictions_file),
  with_ref = file.path(result_dir, "copyKAT", "ref", predictions_file)
)

# full results gene by cell 
full_ck_result_file <- glue::glue("{params$sample_id}_copykat_CNA_results.txt")
  
full_ck_result_paths <- c(
  no_ref = file.path(result_dir, "copyKAT", "noref", full_ck_result_file),
  with_ref = file.path(result_dir, "copyKAT", "ref", full_ck_result_file)
)

Output file

Reports will be saved in the notebook directory. The pre-processed and annotated Seurat object per samples are saved in the result folder.

Functions

Here we defined function that will be used multiple time all along the notebook.

Jaccard functions

These functions are taken directly from the supplemental cell type report produced in scpca-nf

Function to calculate Jaccard similarity on two vectors
  • vec1 First vector
  • vec2 Second vector

return Jaccard similarity between the vectors

jaccard <- function(vec1, vec2) {
  length(intersect(vec1, vec2)) / length(union(vec1, vec2))
}
Wrapper function to calculate jaccard similarity matrix for two categorical variables
  • celltype_df The celltype_df data frame which must contain these columns: colname1, colname2, and barcodes

  • colname1 Column name, as a string, of first categorical variable of interest

  • colname2 Column name, as a string, of second categorical variable of interest

return Jaccard similarity matrix for the two columns. colname1 values will be row names and colname2 values will be column names in the final matrix

make_jaccard_matrix <- function(celltype_df, colname1, colname2) {
  # make lists of barcodes for each category, named by the category
  id1_list <- split(celltype_df$barcodes, celltype_df[[colname1]])
  id2_list <- split(celltype_df$barcodes, celltype_df[[colname2]])

  # create the grid of comparisons
  cross_df <- tidyr::expand_grid(id1 = names(id1_list), id2 = names(id2_list))

  # calculate a single Jaccard index for each combination using split lists & ids
  jaccard_scores <- cross_df |>
    purrr::pmap_dbl(\(id1, id2){
      jaccard(id1_list[[id1]], id2_list[[id2]])
    })

  # add scores to the comparison grid and convert to matrix
  jaccard_matrix <- cross_df |>
    dplyr::mutate(jaccard = jaccard_scores) |>
    # convert to matrix
    tidyr::pivot_wider(
      names_from = "id2",
      values_from = "jaccard"
    ) |>
    tibble::column_to_rownames(var = "id1") |>
    as.matrix()

  return(jaccard_matrix)
}
function that turns jaccard matrices into a list of heatmaps
make_heatmap_list <- function(jaccard_matrices, column_title, legend_match, cluster_rows = TRUE) {
  # Set heatmap padding option
  heatmap_padding <- 0.2
  ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(heatmap_padding, "in"))
  # list of heatmaps looking at SingleR/ CellAssign vs tumor/normal
  heatmap <- jaccard_matrices |>
    purrr::imap(
      \(celltype_mat, celltype_method) {
        ComplexHeatmap::Heatmap(
          t(celltype_mat), # transpose because matrix rows are in common & we want a vertical arrangement
          col = circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue")),
          border = TRUE,
          ## Row parameters
          cluster_rows = cluster_rows,
          row_title = celltype_method,
          row_title_gp = grid::gpar(fontsize = 12),
          row_title_side = "left",
          row_names_side = "left",
          row_dend_side = "right",
          row_names_gp = grid::gpar(fontsize = 10),
          ## Column parameters
          cluster_columns = TRUE,
          column_title = column_title,
          column_title_gp = grid::gpar(fontsize = 12),
          column_names_side = "bottom",
          column_names_gp = grid::gpar(fontsize = 10),
          column_names_rot = 90,
          ## Legend parameters
          heatmap_legend_param = list(
            title = "Jaccard index",
            direction = "vertical",
            legend_width = unit(1.5, "in")
          ),
          show_heatmap_legend = celltype_method == legend_match,
        )
      }
    ) |>
    # concatenate vertically into HeatmapList object
    purrr::reduce(ComplexHeatmap::`%v%`)

  return(heatmap)
}
function to plot jaccard matrices to compare one set of annotations to other methods
  • annotation_column will be the columns and will be compared to each method listed in methods_to_compare
plot_jaccard <- function(classification_df,
                         annotation_column, # single column of annotations to compare to all methods
                         methods_to_compare, # list of methods to compare to annotations
                         column_title, # title for columns
                         legend_match, # what legend to keep, should be a name in `methods_to_compare`
                         cluster_rows = TRUE # whether or not to cluster the rows of the heatmap
) {
  # create jaccard matrices
  jaccard_matrices <- methods_to_compare |>
    purrr::map(\(name) {
      make_jaccard_matrix(
        classification_df,
        annotation_column,
        name
      )
    })

  heatmap <- make_heatmap_list(jaccard_matrices, column_title, legend_match, cluster_rows)

  return(heatmap)
}

Analysis

Load the pre-processed Seurat object

# open the processed rds object
srat <- readRDS(file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", params$sample_id,".Rds")))
DefaultAssay(srat) <- "SCT"

CopyKAT results

Below we look at the heatmaps produced by CopyKAT.

Heatmap without reference

Heatmap with endothelial cells as reference

UMAP

Below we prepare and plot a UMAP that shows which cells are classified as diploid, aneuploid, and not defined by CopyKAT. We show a side by side UMAP with results from running CopyKAT both with and without a reference of normal cells.

# read in ck predictions from both reference types (no_normal and with_normal)
ck_results_df <- predictions_paths |> 
  purrr::map(readr::read_tsv) |> 
  dplyr::bind_rows(.id = "reference_used")

# get umap coordinate
umap_df <- srat[["umap"]]@cell.embeddings |>
  as.data.frame() |>
  rownames_to_column('barcodes')

cnv_df <- umap_df |>
   dplyr::left_join(ck_results_df, by = c("barcodes" = "cell.names")) 
ggplot(cnv_df, aes(x = umap_1, y = umap_2, color = copykat.pred)) +
  geom_point(alpha = 0.5, size = 0.5) +
  theme_bw() +
  facet_wrap(vars(reference_used))

Validate common CNAs found in Wilms tumor

To validate some of these annotations, we can also look at some commonly found copy number variations in Wilms tumor patients:

  • Loss of Chr1p
  • Gain of Chr1q
  • Loss of Chr11p13
  • Loss of Chr11p15
  • Loss of Chr16q

Although these are the most frequent, there are patients who do not have any of these alterations and patients that only have some of these alterations. See Tirode et al., and Crompton et al..

CopyKAT outputs a matrix that contains the estimated copy numbers for each gene in each cell. We can read that in and look at the mean estimated copy numbers for each chromosome across each cell. We might expect that tumor cells would show an increased estimated copy number in Chr1q, and/or a loss of Chr1p, Chr11p and Chr16q.

# read in full gene by cell copy number detection results
full_ck_results_df <- full_ck_result_paths |>
  purrr::map(readr::read_tsv) |>
  dplyr::bind_rows(.id = "reference_used")

# for every cell, calculate the mean detection level across all genes in a given chromosome
full_cnv_df <- full_ck_results_df |>
  tidyr::pivot_longer(
    cols = -c(
      reference_used,
      chrom
    ),
    names_to = "barcodes",
    values_to = "cnv_detection"
  ) |>
  dplyr::group_by(chrom, barcodes, reference_used) |>
  dplyr::summarise(mean_cnv_detection = mean(cnv_detection))

# join with cnv info
cnv_df <- cnv_df |>
  dplyr::left_join(full_cnv_df, by = c("barcodes", "reference_used")) |>
  dplyr::filter(!is.na(chrom))

Let’s look at the distribution of CNV estimation in cells that are called aneuploid and diploid by CopyKAT.

# create faceted density plots showing estimation of CNV detection across each chr of interest
# colored by aneuploid/diploid estimation
ggplot(cnv_df, aes(x = mean_cnv_detection, color = copykat.pred)) +
  geom_density() +
  theme_bw() +
  facet_grid(
    rows = vars(chrom),
    cols = vars(reference_used)
  )

Conclusions

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Vienna
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggalluvial_0.12.5      org.Hs.eg.db_3.19.1    AnnotationDbi_1.66.0   IRanges_2.38.1         S4Vectors_0.42.1       Biobase_2.64.0         BiocGenerics_0.50.0    clusterProfiler_4.12.6
##  [9] enrichplot_1.24.4      msigdbr_7.5.1          patchwork_1.3.0        lubridate_1.9.3        forcats_1.0.0          stringr_1.5.1          dplyr_1.1.4            purrr_1.0.2           
## [17] readr_2.1.5            tidyr_1.3.1            tibble_3.2.1           ggplot2_3.5.1          tidyverse_2.0.0        SCpubr_2.0.2           sctransform_0.4.1      Seurat_5.1.0          
## [25] SeuratObject_5.0.2     sp_2.1-4               optparse_1.7.5         Matrix_1.7-0          
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22        splines_4.4.1           later_1.3.2             ggplotify_0.1.2         R.oo_1.26.0             polyclip_1.10-7         fastDummies_1.7.4      
##   [8] lifecycle_1.0.4         httr2_1.0.5             rprojroot_2.0.4         globals_0.16.3          lattice_0.22-6          vroom_1.6.5             MASS_7.3-61            
##  [15] magrittr_2.0.3          sass_0.4.9              plotly_4.10.4           rmarkdown_2.28          jquerylib_0.1.4         yaml_2.3.10             httpuv_1.6.15          
##  [22] spam_2.10-0             spatstat.sparse_3.1-0   reticulate_1.39.0       cowplot_1.1.3           pbapply_1.7-2           DBI_1.2.3               RColorBrewer_1.1-3     
##  [29] abind_1.4-8             zlibbioc_1.50.0         Rtsne_0.17              R.utils_2.12.3          ggraph_2.2.1            yulab.utils_0.1.7       tweenr_2.0.3           
##  [36] rappdirs_0.3.3          GenomeInfoDbData_1.2.12 ggrepel_0.9.6           irlba_2.3.5.1           listenv_0.9.1           spatstat.utils_3.1-0    tidytree_0.4.6         
##  [43] goftest_1.2-3           RSpectra_0.16-2         spatstat.random_3.3-2   fitdistrplus_1.2-1      parallelly_1.38.0       leiden_0.4.3.1          codetools_0.2-20       
##  [50] getopt_1.20.4           DOSE_3.30.5             ggforce_0.4.2           DT_0.33                 tidyselect_1.2.1        aplot_0.2.3             UCSC.utils_1.0.0       
##  [57] farver_2.1.2            viridis_0.6.5           matrixStats_1.4.1       spatstat.explore_3.3-2  jsonlite_1.8.9          tidygraph_1.3.1         progressr_0.14.0       
##  [64] ggridges_0.5.6          survival_3.7-0          tools_4.4.1             treeio_1.28.0           ica_1.0-3               Rcpp_1.0.13             glue_1.8.0             
##  [71] gridExtra_2.3           xfun_0.47               qvalue_2.36.0           GenomeInfoDb_1.40.1     withr_3.0.1             fastmap_1.2.0           fansi_1.0.6            
##  [78] digest_0.6.37           gridGraphics_0.5-1      timechange_0.3.0        R6_2.5.1                mime_0.12               colorspace_2.1-1        scattermore_1.2        
##  [85] GO.db_3.19.1            tensor_1.5              spatstat.data_3.1-2     RSQLite_2.3.7           R.methodsS3_1.8.2       utf8_1.2.4              generics_0.1.3         
##  [92] data.table_1.16.0       graphlayouts_1.2.0      httr_1.4.7              htmlwidgets_1.6.4       scatterpie_0.2.4        uwot_0.2.2              pkgconfig_2.0.3        
##  [99] gtable_0.3.5            blob_1.2.4              lmtest_0.9-40           XVector_0.44.0          shadowtext_0.1.4        htmltools_0.5.8.1       fgsea_1.30.0           
## [106] dotCall64_1.1-1         scales_1.3.0            png_0.1-8               spatstat.univar_3.0-1   ggfun_0.1.6             knitr_1.48              rstudioapi_0.16.0      
## [113] tzdb_0.4.0              reshape2_1.4.4          nlme_3.1-166            cachem_1.1.0            zoo_1.8-12              KernSmooth_2.23-24      parallel_4.4.1         
## [120] miniUI_0.1.1.1          pillar_1.9.0            grid_4.4.1              vctrs_0.6.5             RANN_2.6.2              promises_1.3.0          xtable_1.8-4           
## [127] cluster_2.1.6           evaluate_1.0.0          cli_3.6.3               compiler_4.4.1          rlang_1.1.4             crayon_1.5.3            future.apply_1.11.2    
## [134] labeling_0.4.3          fs_1.6.4                plyr_1.8.9              stringi_1.8.4           BiocParallel_1.38.0     viridisLite_0.4.2       deldir_2.0-4           
## [141] babelgene_22.9          munsell_0.5.1           Biostrings_2.72.1       lazyeval_0.2.2          spatstat.geom_3.3-3     GOSemSim_2.30.2         RcppHNSW_0.6.0         
## [148] hms_1.1.3               bit64_4.5.2             future_1.34.0           KEGGREST_1.44.1         shiny_1.9.1             highr_0.11              ROCR_1.0-11            
## [155] igraph_2.0.3            memoise_2.0.1           bslib_0.8.0             ggtree_3.12.0           fastmatch_1.1-4         bit_4.5.0               gson_0.1.0             
## [162] ape_5.8